# Construct weights fixed across time and space
# JHL 

# Clean environment and initialize
rm(list = ls(all = TRUE))

# Packages: install anything from the list that is not yet available locally.
list.of.packages <- c("folderfun", "data.table", "bit64", "lubridate", "foreign", "dplyr", "ggplot2")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) {
  install.packages(new.packages, repos = "https://cran.us.r-project.org")
}

# Attach with library() rather than require(): library() stops with an error
# when a package cannot be loaded, whereas require() only returns FALSE and
# lets the script continue until some later call fails obscurely.
library(folderfun)
library(data.table)
library(lubridate)
library(parallel)
library(bit64)
library(foreign)

# data.table setting: round 0 bytes of numeric values when keying/grouping
# (see ?setNumericRounding).
setNumericRounding(0)

# Setup paths
# folderfun: register the "path_home" folder function, then read the project
# root from it (presumably resolved from an R option or environment variable
# of that name -- confirm against the folderfun setup on the cluster).
setff("path_home")
path_home <- ffpath_home()

# Input data directory and the directory that holds the price-index scripts
repository <- file.path(path_home, "dta/nielsen/sqg_posted_group")
do_path <- file.path(path_home, "do/data_prep/01_price_index")

# Helper functions used below live in a separate script: load them now
PI_functions_opt <- file.path(do_path, "05_PIq_fns.R")
source(PI_functions_opt)

# Output directory for the fixed weights, plus the directories in which
# progress-marker files are dropped
fw_out <- file.path(path_home, "dta/nielsen/PIq_06_15/fix_weight/state_group")
load_marker <- file.path(path_home, "dta/nielsen/PIq_06_15/fix_weight/load_marker")
store_marker <- file.path(path_home, "dta/nielsen/PIq_06_15/fix_weight/store")
all_marker <- file.path(path_home, "dta/nielsen/PIq_06_15/fix_weight/all_done")
partial <- file.path(path_home, "dta/nielsen/PIq_06_15/fix_weight/partial")

	 # Build (once) and cache a data.table describing every input file:
	#   file_name - full path of the file inside `repository`
	#   size_id   - rank of the file in the size-sorted listing (1 = first listed)
	#   id        - reproducible random permutation of 1..n (seeded), used
	#               further down to spread files across array tasks
	#   state     - part of the file name before the first "_"
	files_cache <- sprintf("%s/sqg_0615_files.RData", do_path)

	if (!file.exists(files_cache)) {

		ptm <- proc.time()

		# Fail early with a clear message instead of caching a broken table
		if (!dir.exists(repository)) {
			stop("Repository directory not found: ", repository, call. = FALSE)
		}

		# System call is a much more efficient listing method; -S sorts by size
		# so arrays can be run based on size. The directory is passed as a
		# quoted argument instead of calling setwd(), so the working directory
		# of the process is left untouched.
		listed <- system(paste("ls -S", shQuote(repository)), intern = TRUE)
		if (length(listed) == 0) {
			stop("No files found in repository: ", repository, call. = FALSE)
		}

		file.table <- data.table(file_name = file.path(repository, listed))

		# Fixed seed so the random id assignment is the same on every rebuild
		set.seed(10)
		n_files <- nrow(file.table)
		file.table[, ':='(size_id = .I, id = sample.int(n_files))]

		# State prefix = text before the first "_" of the file's base name
		file.table[, state := tstrsplit(basename(file_name), "_", fixed = TRUE)[[1]]]

		save(file.table, file = files_cache) # Save to RData for quick loading
		print(proc.time() - ptm)

	}

	# Always read the table back from the cache so every run uses the same ids
	load(files_cache)
	 
# Get the ID from the slurm array to have a variable name
# Naturally string, if use numerical value transform into numeric
myID <- as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID"))

verbose <- TRUE

prev.time <- verbose.Print.Time(verbose, "List locations")

setkey(file.table, id)

# Percentile cut points of the file ids: 101 boundaries delimiting 100 bins,
# one bin per array task.
quant <- quantile(unique(file.table$id), seq(0, 1, by = 0.01))
n_bins <- length(quant) - 1L

# The task id indexes the cut points directly, so it has to be a whole number
# in 1..n_bins; otherwise stop with a readable message rather than a
# subscript error.
if (is.na(myID) || myID != round(myID) || myID < 1 || myID > n_bins) {
  stop(
    "SLURM_ARRAY_TASK_ID must be an integer between 1 and ", n_bins,
    " (got '", Sys.getenv("SLURM_ARRAY_TASK_ID"), "')",
    call. = FALSE
  )
}

bin_lower <- quant[[myID]]
bin_upper <- quant[[myID + 1]]

# Bins are half-open [lower, upper) so that an id lying exactly on a cut point
# belongs to one task only; the last bin is closed so the largest id is kept.
if (myID < n_bins) {
  file.table <- file.table[id >= bin_lower & id < bin_upper, ]
} else {
  file.table <- file.table[id >= bin_lower & id <= bin_upper, ]
}

# Options for collapse opt: Default F is by store
core <- 8     # number of worker processes handed to mcMap below
marker <- 1L  # L assigns integer value instead of numerical, saves memory

# Process every file assigned to this task. For each file:
#   1. skip it when both its load marker and its store marker already exist;
#   2. otherwise write the load marker and read the file;
#   3. build the fixed weights and save them to `fw_out`;
#   4. write the store marker (also written when there is nothing to compute,
#      so the file is not retried).
for (i in file.table$id) {

	# Format: state_group (state is quantile for parallelization)
	file_path <- file.table[id == i, file_name]
	sg <- basename(file_path)
	name_parts <- strsplit(sg, split = "_")[[1]]
	state <- as.numeric(name_parts[1])
	# Group = second "_"-separated piece with everything from the first "." removed
	group <- as.numeric(strsplit(name_parts[2], split = "[.]")[[1]][1])

	# Track progress
	INIT <- sprintf('%s/%d_%s.csv', load_marker, state, group)
	STORE <- sprintf('%s/%d_%s.csv', store_marker, state, group)

	# Scalar condition, so use the short-circuit && rather than elementwise &
	if (file.exists(INIT) && file.exists(STORE)) {
		cat(sprintf(' State : %d & group : %s entire skip', state, group))
		cat('\n')
		next
	}

	cat(sprintf('loading: %d state & %s group', state, group))
	cat('\n')
	write.csv(marker, file = INIT, row.names = FALSE)
	prev.time <- verbose.Print.Time(TRUE, sprintf('loading: %d state & %s group', state, group))

	## Load 06-15 file
	dt_state_save <- readRDS(file_path)
	dt_state_save[, Volume := Units * Price]

	## Run if non-empty
	if (nrow(dt_state_save) == 0) {

		cat(sprintf('State : %d & group %s: NULL_dt', state, group))
		cat('\n')
		write.csv(marker, file = STORE, row.names = FALSE)

		next
	}

	# Return same data table with only locations that are not missing in any year
	dt_store <- geo.level.check(dt_state_save, "by_store")

	if (nrow(dt_store) < 4) {
		cat(sprintf('State : %d & group : %s < 4', state, group))
		cat('\n')
		write.csv(marker, file = STORE, row.names = FALSE)

		gc()
		next
	}

	# Create list of fixed weights in each file before any other output to expedite production of fixed weights

	# Calculate number of quarters available for each store x upc:
	# num_quarter = number of rows sharing the same (store, upc, upc version)
	setkey(dt_state_save, store_code_uc, upc, upc_ver_uc)
	dt_state_save[, groupkey := .GRP, by = key(dt_state_save)]
	dt_state_save[, num_quarter := .N, by = (groupkey)]

	# VUL: Volume summed within store x num_quarter; TVC: Volume summed within store
	dt_state_save[, ':=' (VUL = sum(Volume)), by = .(store_code_uc, num_quarter)]
	dt_state_save[, ':=' (TVC = sum(Volume)), by = (store_code_uc)]

	# Weights fixed over time using initial period, balanced products only
	# (40 rows per product; presumably 4 quarters x 10 years, 2006-2015 -- confirm)
	w_ft_40 <- dt_state_save[num_quarter == 40 & year == 2006, ]

	if (nrow(w_ft_40) == 0) {
		write.csv(marker, file = STORE, row.names = FALSE)
		next
	}

	# Create fixed weights: Fix_weight is applied to each element of the
	# Y2006 component returned by fast.subset, in parallel over `core` workers
	w_ft <- fast.subset(w_ft_40, "by_store")
	w_ft <- mcMap(Fix_weight, w_ft$Y2006,
					mc.set.seed = TRUE, mc.cores = core)
	w_ft <- rbindlist(w_ft)

	# Merge in revenue ratios (taken from the first-quarter rows)
	w_ft_40 <- w_ft_40[quarter == 1, ]
	setkey(w_ft, store_code_uc, upc, upc_ver_uc)
	setkey(w_ft_40, store_code_uc, upc, upc_ver_uc)
	w_ft <- w_ft[w_ft_40, nomatch = 0]
	w_ft <- w_ft[, .(store_code_uc, upc, upc_ver_uc, q_fix, s_fix, VUL, TVC)]

	save(w_ft, file = sprintf('%s/%d_%s.RData', fw_out, state, group))

	write.csv(marker, file = STORE, row.names = FALSE)

	# NOTE: the script previously carried further statements here (reloading
	# the saved weights and writing an all_done marker), but they sat behind
	# an unconditional `next` and could never execute. They have been dropped;
	# as before, no all_done marker is written.
	gc()

}

cat('Done')
    





















